cars <- read.csv("CarPriceData.csv")
The goal of this project is to use multiple regression to predict car prices. We are going to clean, engineer and remove data from the given cars data found here. We will the go through the remaining data and create visuals and models. Based on the visuals more data will be removed and a model will be created from the remaining data that helps creates a best fit. The model will then be used to predict 20 unknown prices from the data. After the predictions are made they will be submitted in a competition against other models to see which model is the best.
RMSE <- function(predict, obs) {
RMSE <- sqrt(mean((predict - obs)^2, na.rm = TRUE))
return(RMSE)
}
fixMake <- function(make) {
make <- tolower(make)
fixed <- c()
for (val in make) {
switch(val,
"alfa-romero" = {
fixed <- c(fixed, "alfa-romeo")
},
"maxda" = {
fixed <- c(fixed, "mazda")
},
"porcshce" = {
fixed <- c(fixed, "porsche")
},
"toyouta" = {
fixed <- c(fixed, "toyota")
},
"vokswagen" = {
fixed <- c(fixed, "volkswagen")
},
"vw" = {
fixed <- c(fixed, "volkswagen")
},
{
fixed <- c(fixed, val)
}
)
}
return(fixed)
}
buildIntervals <- function(data, model) {
confidence <- as.data.frame(predict.lm(model, newdata = data, interval = "confidence")) %>%
rename(confLwr = lwr, confUpr = upr)
prediction <- as.data.frame(predict.lm(model, newdata = data, interval = "prediction")) %>%
rename(predictLwr = lwr, predictUpr = upr) %>%
select(predictLwr, predictUpr)
intervalData <- cbind(data, confidence, prediction)
return(intervalData)
}
cars2 <- cars %>%
filter(!is.na(price)) %>%
select(
-X,
-wheelbase,
-enginetype,
-fuelsystem,
-symboling,
-stroke,
-peakrpm,
-boreratio,
-doornumber,
-cylindernumber,
-enginelocation
)
enginetype: This can be be explained by enginesize.
wheelbase: The data varies greatly and we choose not to use it to predict price.
fuelsystem: There are factors that vary little to one data point.
symboling: The price is not set by this.
stroke: This can be be explained by enginesize.
peakrpm: This can be explained by horsepower.
cyclindernumber: This can be be explained by enginesize.
boreratio: This can be be explained by enginesize.
doornumber: This is explained by carbody.
enginelocation: There are only three cars in the data set with rear engines and they have other similar traits
X: Same as car_ID
cars3 <- cars2 %>%
separate(CarName, into = c("make", "model"), sep = " ") %>%
mutate(
make = fixMake(make),
projectedVolume = carwidth * carlength * carheight,
avgMPG = (citympg + highwaympg) / 2,
logHP = log10(horsepower),
logPrice = log10(price)
)
make: The manufacturer of the vechicle created by separating CarName. (Categorical)
model: The manufacturer of the vechicle created by separating CarName. (Categorical)
projectedVolume: The projected volume of the car created by $carwidth carlength carheight $. (Numeric)
avgMPG: The average of citympg and highwaymph. (Numeric)
logHp: The number of digits in the horsepower. (Numeric)
logPrice: The number of digits in the price. (Numeric)
cars4 <- cars3 %>%
select(
-model,
-citympg,
-highwaympg,
-carwidth,
-carlength,
-carheight,
-horsepower
) %>%
mutate(
make = as.factor(make),
carbody = as.factor(carbody),
fueltype = as.factor(fueltype)
)
CarName: Split into make and model.
model: This is not important information.
citympg: citympg is now explained in avgMPG
highwaymph: highwaymph is now explained in avgMPG
carwidth: carwidth is now explained in projectedVolume
carlength: carlength is now explained in projectedVolume
carheight: carheight is now explained in projectedVolume
horsepower: horsepower is now explained in logHP
cylindernumber: cylindernumber is now explained in hpPerCyl
model1 <- lm(logPrice ~ enginesize, data = cars4)
df1 <- buildIntervals(cars4, model1)
ggplot(df1, aes(x = enginesize)) +
geom_point(aes(y = price), alpha = 0.5) +
geom_ribbon(aes(ymin = 10^confLwr, ymax = 10^confUpr), fill = "yellow", alpha = 0.4) +
geom_line(aes(y = 10^fit), color = "#3366FF", size = 0.75) +
geom_line(aes(y = 10^confLwr), linetype = "dashed", size = 0.75) +
geom_line(aes(y = 10^confUpr), linetype = "dashed", size = 0.75) +
geom_line(aes(y = 10^predictLwr), linetype = "dashed", color = "red", size = 0.75) +
geom_line(aes(y = 10^predictUpr), linetype = "dashed", color = "red", size = 0.75) +
labs(
title = "Figure 1: Size of Car vs Price",
x = bquote("Size of Car" ~ ("in"^3)),
y = "Price ($)",
)
slope1 <- round(summary(model1)$coefficients[2, 1], 3)
intercept1 <- round(summary(model1)$coefficients[1, 1], 3)
Figure 1 shows that the price of the car can be explained by the size of the car’s (enginesize) using the formula \(price = 10^{0.004 \cdot enginesize} \cdot 3250.8729739\).
model2 <- lm(logPrice ~ curbweight, data = cars4)
df2 <- buildIntervals(cars4, model2)
ggplot(df2, aes(x = curbweight)) +
geom_point(aes(y = price), alpha = 0.5) +
geom_ribbon(aes(ymin = 10^confLwr, ymax = 10^confUpr), fill = "yellow", alpha = 0.4) +
geom_line(aes(y = 10^fit), color = "#3366FF", size = 0.75) +
geom_line(aes(y = 10^confLwr), linetype = "dashed", size = 0.75) +
geom_line(aes(y = 10^confUpr), linetype = "dashed", size = 0.75) +
geom_line(aes(y = 10^predictLwr), linetype = "dashed", color = "red", size = 0.75) +
geom_line(aes(y = 10^predictUpr), linetype = "dashed", color = "red", size = 0.75) +
labs(
title = "Figure 2: Car Weight vs Price",
x = "Car Weight (lbs)",
y = "Price ($)",
)
slope2 <- round(summary(model2)$coefficients[2, 1], 4)
intercept2 <- round(summary(model2)$coefficients[1, 1], 4)
Figure 2 shows that the price of the car can also be explained by the weight of the car (curbweight) using the formula \(price = 10^{0.0004 \cdot curbweight} \cdot 1250.2590302\).
model3 <- lm(curbweight ~ enginesize, data = cars4)
df3 <- buildIntervals(cars4, model3)
ggplot(df3, aes(x = enginesize)) +
geom_point(aes(y = curbweight), alpha = .75) +
geom_smooth(aes(y = curbweight), method = lm, fill = "yellow") +
geom_line(aes(y = confLwr), linetype = "dashed") +
geom_line(aes(y = confUpr), linetype = "dashed") +
geom_line(aes(y = predictLwr), linetype = "dashed", color = "red") +
geom_line(aes(y = predictUpr), linetype = "dashed", color = "red") +
labs(
title = "Figure 3: Engine Size vs Weight of Car",
x = "Weight of Car (lbs)",
y = "Engine Size (CID)",
)
slope3 <- round(summary(model3)$coefficients[2, 1], 3)
intercept3 <- round(summary(model3)$coefficients[1, 1], 3)
Figure 3 shows that the size of the car’s (enginesize) can be explained by the weight of the car (curbweight) using the formula \(enginesize = 10.476 \cdot curbweight + 1221.953\). This means that curbweight and enginesize are colinear and only one should be used in the final model. Since Figure 3 show that price is better explained by curbweight; enginesize will be thrown out of the data later on in favor of curbweight.
model4 <- lm(logPrice ~ logHP, data = cars4)
df4 <- buildIntervals(cars4, model4)
ggplot(df4, aes(x = logHP)) +
geom_point(aes(y = price), alpha = 0.5) +
geom_ribbon(aes(ymin = 10^confLwr, ymax = 10^confUpr), fill = "yellow", alpha = 0.4) +
geom_line(aes(y = 10^fit), color = "#3366FF", size = 0.75) +
geom_line(aes(y = 10^confLwr), linetype = "dashed", size = 0.75) +
geom_line(aes(y = 10^confUpr), linetype = "dashed", size = 0.75) +
geom_line(aes(y = 10^predictLwr), linetype = "dashed", color = "red", size = 0.75) +
geom_line(aes(y = 10^predictUpr), linetype = "dashed", color = "red", size = 0.75) +
labs(
title = "Figure 4: Horsepower vs Price",
x = "Log Horsepower",
y = "Price ($)",
)
slope4 <- round(summary(model3)$coefficients[2, 1], 3)
intercept4 <- round(summary(model3)$coefficients[1, 1], 3)
Figure 4 shows that the price of the car can also be explained by log of horsepower (logHP) using the formula \(price = 10.476 \cdot 10^{logHP} + 1221.953\).
ggplot(df4, aes(x = fct_reorder(make, price, .fun = mean), y = price)) +
geom_bar(stat = "summary", fun = "mean", fill = "#00bfc4") +
labs(
title = "Figure 5: Car Make vs Average Price",
x = "Car Make",
y = "Average price",
) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Figure 5 show the average price varies greatly based on the make of the car. This means when that make should be considered in our final model.
ggplot(cars4, aes(x = fct_reorder(carbody, price, .fun = mean), y = price)) +
geom_bar(stat = "summary", fun = "mean", fill = "#c77cff") +
labs(
title = "Figure 6: Car Body vs Average Price",
x = "Car Body",
y = "Average Price ($)",
) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Figure 6 show the average price varies based on the body of the car (carbody). This means when that carbody could be considered in our final model.
ggplot(cars4, aes(x = fct_reorder(fueltype, price, .fun = mean), y = price, fill = fueltype)) +
geom_bar(stat = "summary", fun = "mean", show.legend = FALSE) +
labs(
title = "Figure 7: Fuel Type vs Price",
x = "Fuel Type",
y = "Average Price ($)",
) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Figure 7 show the average price varies based on the fuel type of the car (fueltype). This means when that fueltype could be considered in our final model.
cars5 <- cars4 %>%
select(-enginesize)
testModel <- lm(logPrice ~ . - car_ID - price, data = cars5)
bestSelect <- ols_step_best_subset(testModel)
To start off we dropped the enginesize data because of our observation in Figure 3. After dropping enginesize we can create a model that includes all the data we did not remove. This model excludes car_ID and price. Now a ols_step_best_subset can be ran on the model. Now an ols_step_best_subset will create subset models of all our variables and output the best model for each number of variables in the formula. The looking at the result we can determined that the best number of are five; the variables being listed below.
finalModel <- lm(logPrice ~ curbweight + logHP + make + carbody + fueltype, data = cars5)
finalSummary <- summary(finalModel)
coefficients <- round(finalSummary$coefficients[, 1], 4)
numCoe <- length(coefficients)
After finding these variables we can create a model for them. Using the newly created model we can find the following function:
\[ price = 2.819032 \cdot horsepower + 10^{0.0002 \cdot {curbweight}} \\~\\ \cdot 10^{0.0839 \cdot 1_{make = Audi}} \cdot 10^{0.1768 \cdot 1_{make = BMW}} \\~\\ \cdot 10^{0.0974 \cdot 1_{make = Buick}} \cdot 10^{-0.0351 \cdot 1_{make = Chevrolet}} \\~\\ \cdot 10^{-0.0658 \cdot 1_{make = Dodge}} \cdot 10^{-0.0295 \cdot 1_{make = Honda}} \\~\\ \cdot 10^{-0.0121 \cdot 1_{make = Isuzu}} \cdot 10^{0.0146 \cdot 1_{make = Jaguar}} \\~\\ \cdot 10^{0.0084 \cdot 1_{make = Mazda}} \cdot 10^{-0.0291 \cdot 1_{make = Mercury}} \\~\\ \cdot 10^{-0.0924 \cdot 1_{make = Mitsubishi}} \cdot 10^{-0.0427 \cdot 1_{make = Nissan}} \\~\\ \cdot 10^{-0.0465 \cdot 1_{make = Peugeot}} \cdot 10^{-0.0773 \cdot 1_{make = Plymouth= yes}} \\~\\ \cdot 10^{0.1902 \cdot 1_{make = Porsche= yes}} \cdot 10^{-0.0719 \cdot 1_{make = Renault= yes}} \\~\\ \cdot 10^{0.0246 \cdot 1_{make = Saab}} \cdot 10^{-0.0761 \cdot 1_{make = Subaru}} \\~\\ \cdot 10^{-0.0544 \cdot 1_{make = Toyota}} \cdot 10^{-0.0141 \cdot 1_{make = Volkswagen}} \\~\\ \cdot 10^{0.0238 \cdot 1_{make = Volvo}} \cdot 10^{-0.0676 \cdot 1_{carbody = hardtop}} \\~\\ \cdot 10^{-0.0978 \cdot 1_{carbody = hatchback}} \cdot 10^{-0.0783 \cdot 1_{carbody = sedan}} \\~\\ \cdot 10^{-0.106 \cdot 1_{carbody = wagon}} \cdot 10^{-0.052 \cdot 1_{fueltype = gas}} \\~\\ \cdot 10^{2.7858}\]
minWeight <- min(cars$curbweight)
maxWeight <- max(cars$curbweight)
weightExample <- round((10^coefficients[2])^2250, 4)
meanIncrease <- round(mean((10^coefficients[2])^cars$curbweight), 4)
minMake <- round(10^min(coefficients[4:23]), 4)
maxMake <- round(10^max(coefficients[4:23]), 4)
minBody <- round(10^min(coefficients[24:28]), 4)
maxBody <- round(10^max(coefficients[24:28]), 4)
Let’s take a deeper look at each variable affects the model.
horsepower: Every time the horsepower increases by \(1\) the price increases by \(2.819032\).
curbweight: The price of the car is multiplied by \(1.0005^{curbweight}\). For example if you have car that weights \(2250\) lbs then the price multiplied by \(2.8184\); because cars weight range from \(1488\) to \(4066\) the average amount the price is multiplied by is \(3.3435\). The is important because curbweight is about tripling the price on average.
make: The priceof increases or decreases based on the make. It does this by multiplying the price by \(0.8084\) a the lowest and \(1.5495\) at the highest. make is important because it can greatly lower the price or slightly increase the price.
carbody: Multiplies price by \(0.7834\) a the lowest and \(1.0563\) at the highest.
fueltype: If the fueltype is gas then the price is multiply by \(0.8872\).
Now that we understand the model a little more let take a look at how well our model fits the data.
corCoff <- round(cor(cars5$price, 10^finalModel$fitted.values), 4)
carRMSE <- round(RMSE(10^finalModel$fitted.values, cars5$price), 4)
The correlation coefficient and RMSE look pretty good so lets take a look at the Residuals vs Fitted of the model and the Q-Q plot of the model.
autoplot(finalModel,1:2)
Looking at the Residuals vs Fitted above the data look pretty random which is good, but the Q-Q shows that the model is pretty good towards the mean price, but it not great at estimating Very High or Very Low prices.
Now that we create a model we can use it to predict the prices for the cars without values.
cars6 <- cars %>%
filter(is.na(price)) %>%
select(
-X,
-wheelbase,
-enginetype,
-fuelsystem,
-symboling,
-stroke,
-peakrpm,
-boreratio,
-doornumber,
-cylindernumber,
-enginelocation
) %>%
separate(CarName, into = c("make", "model"), sep = " ") %>%
mutate(
make = fixMake(make),
projectedVolume = carwidth * carlength * carheight,
avgMPG = (citympg + highwaympg) / 2,
logHP = log10(horsepower),
logPrice = log10(price)
) %>%
select(
-model,
-citympg,
-highwaympg,
-carwidth,
-carlength,
-carheight,
-horsepower
)
cars7 <- buildIntervals(cars6, finalModel) %>%
select(car_ID, fit, predictLwr, predictUpr) %>%
mutate(fit = 10^fit, predictLwr = 10^predictLwr, predictUpr = 10^predictUpr) %>%
rename(price = fit, lower = predictLwr, upper = predictUpr)
kable(cars7)
| car_ID | price | lower | upper |
|---|---|---|---|
| 14 | 21925.112 | 16814.779 | 28588.575 |
| 28 | 8838.829 | 6767.418 | 11544.271 |
| 62 | 10072.907 | 7788.371 | 13027.558 |
| 70 | 30075.819 | 22600.031 | 40024.499 |
| 79 | 6058.387 | 4662.255 | 7872.596 |
| 80 | 7775.771 | 5985.254 | 10101.928 |
| 90 | 6771.282 | 5222.411 | 8779.518 |
| 98 | 6816.032 | 5226.726 | 8888.604 |
| 101 | 9606.346 | 7420.542 | 12436.002 |
| 103 | 17699.608 | 13547.104 | 23124.952 |
| 111 | 17060.152 | 13035.431 | 22327.514 |
| 116 | 13754.299 | 10548.827 | 17933.818 |
| 126 | 23450.790 | 17532.957 | 31366.047 |
| 132 | 8951.273 | 6257.751 | 12804.166 |
| 133 | 13442.702 | 10161.954 | 17782.628 |
| 137 | 17088.992 | 12896.470 | 22644.464 |
| 144 | 8930.717 | 6874.070 | 11602.692 |
| 150 | 10464.163 | 8024.122 | 13646.192 |
| 166 | 9801.112 | 7554.124 | 12716.471 |
| 184 | 9250.621 | 7123.241 | 12013.351 |